
library(dplyr)
library(data.table)
library(scales)

cohort <- fread("H:/Zheng_10223/Joint/cohort_2025.csv")


# get counts of occupations:

dftab=cohort %>% group_by(NOC2_CD11_Main) %>% summarize(count=n())


tmp <- cohort[!NOC2_CD11_Main=="-", .(N=length(IMDB_ID), 
                         mean=mean(MainParent_Income_HH_MainParentAge45_49), 
                         median=quantile(MainParent_Income_HH_MainParentAge45_49, 0.5),
                         p25=quantile(MainParent_Income_HH_MainParentAge45_49, 0.25),
                         p75=quantile(MainParent_Income_HH_MainParentAge45_49, 0.75)
                         ), by="NOC2_CD11_Main"]

tmp$NOC2_CD11 <- as.numeric(tmp$NOC2_CD11_Main)


tmp <- tmp[order(NOC2_CD11_Main)]



par(mar=c(4.5, 4.7, 2, 4.5))
plot(x=1:41, y=tmp$N, type="h", lwd=15, col="grey90", lend=4, yaxt="n", ylab="", xaxt="n", xlab="")
axis(4, at=c(0,10000,20000,30000), c(0,10000,20000,30000), cex.axis=1.5, cex.lab=1.5)
par(new=TRUE)
plot(y=tmp$mean, x=1:41, pch=19, ylim=c(20000,270000), xaxt="n", xlab="Intended Occupation", ylab="Actual Household Income Ages 45-49", cex.axis=1.5, cex.lab=1.5)
axis(1, at=1:41, tmp$NOC2_CD11, cex.axis=1.5, cex.lab=1.5)
points(y=tmp$median, x=1:41, pch=21)
arrows(x0=1:41, x1=1:41, y0=tmp$p25, y1=tmp$p75, length=0, angle=90)
mtext(side=4, line=3, "Number of Immigrants", cex=1.5)


# get counts 
write.csv(tmp[,c("NOC2_CD11_Main","N")],"H:/Zheng_10223/ToVet/Supporting/Figure_IntendedOccupation_counts.csv")


library(haven)
cens1991 <- read_dta("H:/Zheng_10223/Joint/Census/census1991_occdata.dta")
cens1996 <- read_dta("H:/Zheng_10223/Joint/Census/census1996_occdata.dta")
cens2001 <- read_dta("H:/Zheng_10223/Joint/Census/census2001_occdata.dta")
cens2006 <- read_dta("H:/Zheng_10223/Joint/Census/census2006_occdata.dta")
cens2011 <- read_dta("H:/Zheng_10223/Joint/Census/census2011_occdata.dta")
allcens <- rbind(cens1991, cens1996, cens2001,cens2006,cens2011)




# cpi 
cpi <- read.csv("H:/Zheng_10223/Joint/cpi/cpi.csv")
allcens <- merge(x=allcens, y=cpi, by.x="year", by.y="year", all.x=T)
allcens$totinc_real <- allcens$totinc*cpi$cpi[which(cpi$year==2020)]/allcens$cpi
allcens$hhinc_real <- allcens$hhinc*cpi$cpi[which(cpi$year==2020)]/allcens$cpi


# drop less than zero income: 
allcens=allcens %>% filter(hhinc_real>=0)


# winsorize:
allcens$totinc_real[which(allcens$totinc_real>=quantile(allcens$totinc_real, 0.999, na.rm=T))] <- quantile(allcens$totinc_real, 0.999, na.rm=T)
allcens$hhinc_real[which(allcens$hhinc_real>=quantile(allcens$hhinc_real, 0.999, na.rm=T))] <- quantile(allcens$hhinc_real, 0.999, na.rm=T)



library(lfe)
model <- felm(hhinc_real~0|noc2011, data=allcens)
census <- getfe(model)[c("effect")]
census$noc2011 <- rownames(census)
census$noc2011 <- substring(census$noc2011, 9, 11)

model <- felm(MainParent_Income_HH_MainParentAge45_49~0|NOC2_CD11_Main, data=cohort)
immigrant <- getfe(model)[c("effect", "obs")]
immigrant$NOC2_CD11 <- rownames(immigrant)
immigrant$NOC2_CD11 <- substring(immigrant$NOC2_CD11, 16, 17)

immigrant <- merge(x=immigrant, y=census, by.x="NOC2_CD11", by.y="noc2011")



par(mar=c(4.5, 5, 2, 2))
plot(immigrant[,c("effect.x", "effect.y")], cex=immigrant$obs/2000, pch=19, col=alpha("black", 0.2), xlab="Average Realized Immigrant Household Income (ages 45-49)", ylab="Average Household Income among Canadian Born", cex.lab=1.5, cex.axis=1.5)
text(x=immigrant$effect.x, y=immigrant$effect.y, immigrant$NOC2_CD11, cex=1.2)
abline(lm(immigrant[,c("effect.y", "effect.x")]), lwd=2, col="red")
text(x=160000, y=100000, paste("Correlation=", round(cor(immigrant$effect.x, immigrant$effect.y), 3), sep=""), cex=1.3, col="red")
